#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#       dse_intraday.maxpy
#
#  This file is part of the MaxTrader Project. http://www.maxtraderbd.com/
#
#  Copyright (c) 2011 invarBrass, MaxTrader Software Services <info@maxtraderbd.com>
#  Portions Copyright (c) 2010, M Nasimul Haque
#  
#  All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions are met:
#      * Redistributions of source code must retain the above copyright
#        notice, this list of conditions and the following disclaimer.
#      * Redistributions in binary form must reproduce the above copyright
#        notice, this list of conditions and the following disclaimer in the
#        documentation and/or other materials provided with the distribution.
#      * Neither the name of the MaxTrader Software Services nor the
#        names of its contributors may be used to endorse or promote products
#        derived from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#  DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# $Author: masroore@gmail.com $
# $Id: dse_intraday.py 22 2011-10-01 15:33:48Z masroore@gmail.com $
# $Rev: 22 $

import re, string
from BeautifulSoup import BeautifulSoup
from pipeline import *

# Field identifiers for a single quote.  They are used as the keys of
# DseScraperContext.column_indices and of every quote dict.  The stages rely
# on them forming the contiguous run 1..9 in exactly this order: they iterate
# range(TICKER, VOLUME + 1) and test LTP <= field <= CHANGE.
TICKER = 1   # validated as an upper-cased string
LTP = 2      # LTP through CHANGE are validated as floats
HIGH = 3
LOW = 4
CLOSE = 5
YCP = 6
CHANGE = 7   # the only numeric field allowed to stay negative
TRADE = 8    # TRADE and VOLUME are validated as ints
VOLUME = 9

class DseScraperContext(PipelineContext):
    """Shared state handed from stage to stage while scraping one page.

    The constructor stores only the input HTML and the filtering flag; the
    per-run working attributes (soup, rows, quotes, column map, counter) do
    not exist until reset() has been called.
    """

    # NOTE(review): PipelineContext.__init__ is not invoked here -- confirm
    # the base class does not need its own initialisation.
    def __init__(self, raw_data):
        """Keep the page source; sanitized_data starts out as the same object."""
        self.filter_nontraded_stocks = True
        self.raw_data = raw_data
        self.sanitized_data = raw_data

    def reset(self):
        """(Re)create every per-run attribute with its empty starting value."""
        self.soup = None
        self.table_rows = []
        self.raw_quotes = []
        self.validated_quotes = []
        self.nontraded_stocks = 0
        # -1 marks a field whose column has not been located yet.
        self.column_indices = dict.fromkeys(range(TICKER, VOLUME + 1), -1)

class HtmlSanitizationStage(Stage):
    """Simplify the raw page markup before it is handed to the HTML parser.

    Attributes are stripped from <body>, <tr>, <td> and <font> tags, every
    <a href=...> opening tag becomes <a href="#">, <link> tags are replaced
    by a placeholder comment, and each CR/LF character becomes a space.
    """

    # Maps the replacement text to the pattern it replaces.
    _tidy_patterns = {
        '<body>': re.compile(r'<body[^>]*>', re.IGNORECASE),
        '<tr>': re.compile(r'<tr[^>]*>', re.IGNORECASE),
        '<td>': re.compile(r'<td[^>]*>', re.IGNORECASE),
        '<a href="#">': re.compile(r'<a\s+href=[^>]*>', re.IGNORECASE),
        '<font>': re.compile(r'<font[^>]*>', re.IGNORECASE),
        '<!-- # -->': re.compile(r'<link[^>]*>', re.IGNORECASE),
        ' ': re.compile(r'[\r\n]')
    }

    def execute(self, context):
        """Run every substitution over context.raw_data.

        The substitutions are applied in the dict's iteration order and the
        result is stored in context.sanitized_data; raw_data is not modified.
        """
        cleaned = context.raw_data
        for replacement, pattern in self._tidy_patterns.items():
            cleaned = pattern.sub(replacement, cleaned)

        context.sanitized_data = cleaned

class SoupifyStage(Stage):
    """Parse the sanitized HTML and collect the rows of the quotes table."""

    def execute(self, context):
        """Reset the context, then store the parsed soup and its table rows.

        context.table_rows receives every <tr> found under the first
        <table> inside <body>.
        """
        context.reset()
        soup = BeautifulSoup(context.sanitized_data)
        context.soup = soup
        quotes_table = soup.body.table
        context.table_rows = quotes_table.findAll('tr')

class FieldsDiscoveryStage(Stage):
    """Locate the table column that holds each quote field.

    The captions are read from the <b> elements of the first table row and
    matched, case-insensitively and as whole words, against one pattern per
    field.  Results are written to context.column_indices.
    """

    _column_patterns = {
        TICKER: re.compile(r'\bcode\b', re.I),
        LTP: re.compile(r'\bltp\b', re.I),
        HIGH: re.compile(r'\bhigh\b', re.I),
        LOW: re.compile(r'\blow\b', re.I),
        CLOSE: re.compile(r'\bclose\b', re.I),
        YCP: re.compile(r'\bycp\b', re.I),
        CHANGE: re.compile(r'\bchange\b', re.I),
        TRADE: re.compile(r'\btrade\b', re.I),
        VOLUME: re.compile(r'\bvolume\b', re.I)
    }

    def execute(self, context):
        """Fill context.column_indices from the header row.

        Each caption is assigned to the first field (in TICKER..VOLUME
        order) whose pattern matches it.  Fields with no matching caption
        keep whatever index they already had.

        Raises IndexError if context.table_rows is empty.
        """
        header_row = context.table_rows[0]
        for col_ix, cell in enumerate(header_row.findAll('b')):
            text = cell.find(text=True)
            if text is None:
                # A <b> element without any text cannot name a column.  It
                # still occupies a position, so col_ix keeps counting it.
                continue
            for field in range(TICKER, VOLUME + 1):
                if self._column_patterns[field].search(text) is not None:
                    # NOTE(review): the +1 presumably accounts for a leading
                    # data column that has no <b> caption -- confirm against
                    # the page layout.
                    context.column_indices[field] = col_ix + 1
                    break

class QuotesScrapingStage(Stage):
    """Turn every data row of the table into a dict of raw field strings."""

    def execute(self, context):
        """Append one raw quote per data row to context.raw_quotes.

        The first entry of context.table_rows is skipped as the header row.
        For each remaining row the cell at context.column_indices[field] is
        read: TICKER from the first child of the cell's <a> element, every
        other field from the cell's first text node converted with str().
        All values are whitespace-stripped.

        Raises ValueError if there are rows to scrape but some field still
        has a negative column index, i.e. its column was never located.
        """
        rows = context.table_rows[1:]
        if not rows:
            return

        column_indices = context.column_indices
        # A negative index is the "not found" marker.  Used as a subscript
        # it would silently read cells from the end of the row and produce
        # wrong values, so refuse to scrape instead.
        missing = sorted(field for field, index in column_indices.items()
                         if index < 0)
        if missing:
            raise ValueError(
                'quote column(s) not found in table header, field id(s): %s'
                % ', '.join(str(field) for field in missing))

        for row in rows:
            cells = row.findAll('td')
            quote = {}
            for field, index in column_indices.items():
                cell = cells[index]
                if field == TICKER:
                    quote[field] = cell.a.contents[0].strip()
                else:
                    quote[field] = str(cell.find(text=True)).strip()

            context.raw_quotes.append(quote)

class QuotesValidationStage(Stage):
    """Convert raw quote strings to typed values and filter the result."""

    def _sanitize(self, quote):
        """Return a new dict holding the typed version of one raw quote.

        TICKER is upper-cased.  LTP through CHANGE become floats and the
        fields after CHANGE become ints.  Any number that is not >= 0 is
        replaced by 0, except CHANGE, which is kept as parsed.
        """
        cleaned = {TICKER: quote[TICKER].upper()}

        for field in range(LTP, CHANGE + 1):
            number = float(quote[field])
            # Deliberately "not >= 0" rather than "< 0": the two differ for
            # NaN, and this form keeps NaN mapped to 0.
            if field != CHANGE and not number >= 0:
                number = 0
            cleaned[field] = number

        for field in range(CHANGE + 1, VOLUME + 1):
            count = int(quote[field])
            if not count >= 0:
                count = 0
            cleaned[field] = count

        return cleaned

    def execute(self, context):
        """Store the sanitized quotes in context.validated_quotes.

        When context.filter_nontraded_stocks is set, quotes whose TRADE
        value is 0 are left out and counted in context.nontraded_stocks.
        """
        kept = []
        for raw in context.raw_quotes:
            quote = self._sanitize(raw)
            if (quote[TRADE] == 0) and context.filter_nontraded_stocks:
                context.nontraded_stocks += 1
                continue
            kept.append(quote)

        context.validated_quotes = kept

class DsebdIntradayScraper(object):
    """Entry point that runs the scraping stages over one HTML page."""

    __pipeline = None

    def __init__(self):
        """Assemble the sequential pipeline; stages run in the order listed."""
        stage_classes = (
            HtmlSanitizationStage,
            SoupifyStage,
            FieldsDiscoveryStage,
            QuotesScrapingStage,
            QuotesValidationStage,
        )
        pipeline = SequentialPipeline()
        for stage_class in stage_classes:
            pipeline.add_stage(stage_class())
        self.__pipeline = pipeline

    def execute(self, html, filter_nontraded_stocks = False):
        """Scrape html and return the list of validated quote dicts.

        filter_nontraded_stocks is copied onto the context before the
        pipeline runs.  Raises Exception carrying context.error_messages()
        when the context reports errors afterwards.
        """
        context = DseScraperContext(html)
        context.filter_nontraded_stocks = filter_nontraded_stocks

        self.__pipeline.execute(context)

        if not context.has_errors():
            return context.validated_quotes

        raise Exception(context.error_messages())
